---
title: "Codebook for: PLOS Qualitative Data Availability"
output:
  html_document:
    toc: true
    toc_depth: 4
    toc_float: true
    code_folding: 'hide'
    self_contained: true
---

This is the codebook for the data in `plos_cleaned_data.rds`. Basic metadata was originally retrieved from PLOS and then augmented with metadata retrieved using the PLOS API and the `retrieve-metadata.R` script, as well as with citation metadata from OpenAlex. The data were then cleaned using the `cleaning_recoding.R` script, which generates the version documented in this codebook. Variables are marked as `(coded)` where they are the result of human coding (see the paper's methods section and appendix for details). Variables are marked as `(generated)` when they are automatically extracted or generated. All other variables are metadata and come directly from the PLOS API.

```{r setup}
# Chunk defaults for the whole document: echo the R code and surface warnings,
# messages, and errors in the rendered output. `error = TRUE` keeps knitting
# going after a failing chunk, which is usually better for debugging.
knitr::opts_chunk$set(echo = TRUE, error = TRUE, message = TRUE, warning = TRUE)

# Make the black-and-white theme the active ggplot2 theme.
ggplot2::theme_set(ggplot2::theme_bw())

```



```{r prepare_codebook, warning=FALSE, results='hide', message=FALSE}
# Load required libraries
library(codebook)
library(labelled)
library(dplyr)
library(haven)
library(readr)

# Read the cleaned analysis data set that this codebook documents.
df <- read_rds("../data/analysis/plos_cleaned_data.rds")

# Coerce the two "other" indicator columns to logical. add_boolean_labels()
# only labels columns for which is.logical() is TRUE, so without this step
# these two would be skipped if they are stored as another type.
df$type_other <- as.logical(df$type_other)
df$method_other <- as.logical(df$method_other)



# Build `df_labelled`: a copy of `df` in which every variable listed below
# carries a human-readable variable label. `df` itself is left unlabelled.
df_labelled <- set_variable_labels(
  df,

  # ---- Document identification ----
  url = "URL of the journal article's XML version",
  doi = "Digital Object Identifier of article",
  article_title = "Title of the article",

  # ---- Publication metadata ----
  pub_date = "Publication date (MM/DD/YY)",
  journal = "Journal name",
  has_supplement = "Whether the article has supplementary material",

  # ---- Author information ----
  authors = "List of authors",
  affiliations = "Author affiliations",
  first_author_affil = "First author's affiliation",
  first_author_country = "First author's country (extracted)",
  first_author_continent = "First author's continent (generated)",

  # ---- Data type and methodology (free-text source variables) ----
  data_type = "Type(s) of data used in the study (coded)",
  methodology = "Methodological approach(es) used",

  # ---- Data availability ----
  data_availability = "Data availability statement as included in the article",
  data_cannot_share = "Data cannot be shared publicly (coded)",
  data_on_request = "Data available on request (coded)",
  data_in_paper_andor_SI = "Data availability statement mentions either supplementary material or paper (originally coded; can also be generated from following three variables)",
  data_in_SI_alone = "Data are included in the supplementary material (coded)",
  data_in_paper_alone = "Data included in the paper (coded)",
  data_in_paper_SI = "Data included in the paper or supplementary material (coded)",
  data_in_repository = "Data available in a repository (coded)",
  data_combination = "DAS includes at least two data sharing methods including on request, in repository, and in paper and/or SI",
  data_other = "Other description of data availability (coded)",

  # ---- Repository-specific detail ----
  data_available_full = "For data in repository: Full data available (coded)",
  data_available_partial_qual = "For data in repository: Partial qualitative data available (coded)",
  data_available_quant = "For data in repository: Quantitative data available (coded)",
  data_available_documentation = "For data in repository: Documentation for the data is available (coded)",
  data_not_available = "For data in repository: Data not available (coded)",

  # ---- Request-specific detail ----
  on_request_authors = "Data available on request from authors (coded)",
  on_request_IRB_ethics_board = "Data available on request from IRB/ethics board (coded)",
  on_request_original_source = "Data available on request from original source (coded)",
  on_request_author_institution = "Data available on request from author's institution (coded)",

  # ---- Content elements ----
  abstract = "Article abstract",
  funding = "Funding information",
  funding_binary = "Whether the study received funding (extracted based on funding variable)",
  cited_by_count = "Number of citations (from openAlex)",

  # ---- Methodology indicators (all boolean, generated from methodology) ----
  method_thematic_analysis = "Thematic analysis used",
  method_mixed_methods = "Mixed methods approach used",
  method_content_analysis = "Content analysis used",
  method_framework_analysis = "Framework analysis used",
  method_coding = "Coding approach used",
  method_grounded_theory_analysis = "Grounded theory analysis used",
  method_phenomenological_analysis = "Phenomenological analysis used",
  method_constant_comparative_analysis = "Constant comparative analysis used",
  method_descriptive_analysis = "Descriptive analysis used",
  method_ethnography = "Ethnography used",
  method_other = "Other methodology used",

  # ---- Data type indicators (all boolean, generated from data_type) ----
  # Names containing spaces, slashes, or parentheses are not syntactic R names
  # and therefore have to be quoted with backticks.
  type_interviews = "Interview data used",
  type_focus_groups = "Focus group data used",
  type_surveys = "Survey data used",
  type_field_notes = "Field notes used",
  type_archival_documents = "Archival documents used",
  `type_quantitative_data_(non-survey)` = "Quantitative data (non-survey) used",
  `type_social_media/blog_posts` = "Social media/blog posts used",
  type_photographs = "Photographs used",
  type_news_sources = "News sources used",
  type_pedagogical_materials = "Pedagogical materials used",
  type_other = "Other data types used",

  # ---- Free-text notes ----
  `CODING NOTES` = "misc notes on coding",
  `Notes for data_available` = "misc notes on data availability for repository data"
)

# Attach "No"/"Yes" value labels to the logical columns matching a pattern.
#
# Arguments:
#   df      - data frame whose columns should be labelled.
#   pattern - regular expression matched against the column names via grepl().
#
# Returns `df` with the value labels c(No = FALSE, Yes = TRUE) set on every
# column that is logical AND whose name matches `pattern`. All other columns
# are left untouched; if nothing qualifies, `df` is returned unchanged.
add_boolean_labels <- function(df, pattern) {
  # vapply() always yields a logical vector, even for a data frame with no
  # columns; sapply() returns an empty list there, which makes `&` fail.
  is_lgl <- vapply(df, is.logical, logical(1))
  vars <- names(df)[grepl(pattern, names(df)) & is_lgl]

  for (var in vars) {
    # `!!sym(var) :=` injects the column name held in `var` as the argument
    # name, so the labels are applied to that one column.
    df <- set_value_labels(df, !!sym(var) := c("No" = FALSE, "Yes" = TRUE))
  }

  df
}

# Label the logical columns of each variable group with "No"/"Yes". The
# patterns are folded over `df_labelled` from left to right, so each call to
# add_boolean_labels() receives the result of the previous one.
df_labelled <- Reduce(
  add_boolean_labels,
  c(
    "^data_",
    "^on_request_",
    "^method_",
    "^type_",
    "has_supplement",
    "funding_binary"
  ),
  df_labelled
)

```



```{r codebook}
# Render the codebook for the labelled data, with the detailed per-variable
# output and the missingness report both switched off.
codebook(
  df_labelled,
  detailed_variables = FALSE,
  missingness_report = FALSE
)
```
